In [1]:
import numpy as np
import matplotlib.pyplot as plt

def sigmoid(z):
    # logistic function, applied elementwise
    return 1.0 / (1.0 + np.exp(-z))

z = np.array([1, 2, 3, -1])
print('z =', z)
print('sigmoid(z) =', sigmoid(z))
z = np.linspace(-5, 5, 20)
plt.plot(z, sigmoid(z))
plt.show()
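The backpropagation step later in this notebook relies on the identity $\sigma'(z) = \sigma(z)\,(1 - \sigma(z))$ for the logistic function. The following cell is an added sanity check (not part of the original walkthrough; the choice of eps is arbitrary) that compares this identity against a centered finite-difference estimate:
In [ ]:
# sanity check: sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z))
eps = 1e-6
z = np.linspace(-5, 5, 20)
analytic = sigmoid(z) * (1 - sigmoid(z))
numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
print('max difference:', np.max(np.abs(analytic - numeric)))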
Suppose we are given training examples $X$ with corresponding targets $Y$ (below, points $x \in [0, 2\pi)$ with targets $y = \sin(x)$).
We train a three-layer neural network with $I$ input units (unit 0 is a constant bias unit), $H$ hidden logistic units, and $O$ linear output units, for a total of $M = I + H + O$ units.
We define $A$ to be the $M \times M$ connectivity matrix, where $A_{kj} = 1$ if there is a connection from unit $k$ to unit $j$ and $A_{kj} = 0$ otherwise, and $W$ to be the corresponding matrix of connection weights.
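To make the unit ordering concrete, consider the network trained at the end of this notebook: $I = 2$ (a bias unit 0 and one input unit 1), $H = 2$ (hidden units 2 and 3), and $O = 1$ (output unit 4), so that $M = 5$ and
$$ A = \begin{pmatrix} 0 & 0 & 1 & 1 & 1 \\ 0 & 0 & 1 & 1 & 0 \\ 0 & 0 & 0 & 0 & 1 \\ 0 & 0 & 0 & 0 & 1 \\ 0 & 0 & 0 & 0 & 0 \end{pmatrix} $$
That is, the bias feeds both hidden units and the output, the input feeds only the hidden units, and each hidden unit feeds the output. Because units are numbered from inputs to outputs, $A$ is strictly upper triangular and every connection goes from a lower-numbered unit to a higher-numbered one.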
In [2]:
class NeuralNetwork:
    def __init__(self, A, I, H, O):
        # A is the M x M connectivity matrix; W holds small random initial
        # weights, masked so that only existing connections are nonzero
        self.A = A
        self.W = np.multiply(np.random.normal(-0.001, 0.001, A.shape), A)
        self.I = I
        self.H = H
        self.O = O
        self.M = I + H + O
        assert self.A.shape[0] == self.A.shape[1]
        assert self.A.shape[0] == I + H + O
        print('creating network with', I, 'input units,', H, 'hidden units,', 'and', O, 'output units')
For a given set of examples `example_X` (one row per training case), we initialize the pre-nonlinearity matrix $X$ and the post-nonlinearity matrix $g$ as follows:
In [3]:
def _create_X_g(self, example_X):
    T = example_X.shape[0]
    X = np.zeros((T, self.M))
    # the inbound (pre-nonlinearity) activity of the bias is not needed
    X[:, 0] = 1
    # the inbound activity of an input unit is the input itself
    X[:, 1:self.I] = example_X
    g = np.zeros(X.shape)
    # the outbound (post-nonlinearity) activity of the bias is always 1
    g[:, 0] = 1
    # the outbound activity of an input unit is the input itself
    g[:, 1:self.I] = example_X
    return X, g
In forward propagation, we compute the outbound post-nonlinearity activities unit by unit (in this case, layer by layer).
In general, suppose $x_j$ is the pre-nonlinearity activity of unit $j$; then the post-nonlinearity activity of unit $j$, $g_j(x_j)$, is computed as:
$$ g_j(x_j) = \frac{1}{1 + e^{-x_j}} $$for logistic units, and
$$ g_j(x_j) = x_j $$for linear units, where
$$ x_j = \sum^{j-1}_{k=1}W_{kj}g_k(x_k) $$Here we use $W_{kj}$ to denote the weight of the connection from unit $k$ to unit $j$. For input units $x_i$, $g_i(x_i)$ is taken to be linear.
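As a worked instance (using the 2-2-1 network described above, numbering units from 0 as in the code: bias unit 0, input unit 1, hidden units 2 and 3, linear output unit 4), the terms with no connection vanish and we get, for example:
$$ x_2 = W_{02} \cdot 1 + W_{12}\, g_1(x_1), \qquad g_2(x_2) = \frac{1}{1 + e^{-x_2}}, \qquad x_4 = W_{04} \cdot 1 + W_{24}\, g_2(x_2) + W_{34}\, g_3(x_3), \qquad g_4(x_4) = x_4 $$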
In [4]:
def forward_propagate(self, X, g):
    for j in self.hidden_units():
        # the pre-nonlinearity activity is the weighted sum of the inbound post-nonlinearity activities
        X[:, j] = np.dot(g, self.W[:, j])
        # the post-nonlinearity activity is the sigmoidal output of the pre-nonlinearity activity
        g[:, j] = self.sigmoid(X[:, j])
    for j in self.output_units():
        X[:, j] = np.dot(g, self.W[:, j])  # same weighted sum as above
        g[:, j] = X[:, j]                  # linear output units
    return X, g
The goal of backpropagation is to compute the derivatives of the error with respect to the weights, which we then use to adjust the weights. We begin by defining the cost function as half the sum of the squared errors at the output units:
$$ J = \frac{1}{2} \sum^{M}_{j = I+H+1}( g_j(x_j) - y_j )^2 $$where $y_j$ represents the target value at the output unit $j$.
We are looking for how the cost function changes with respect to each weight:
$$ \frac{\partial J}{\partial W_{lm}} = \frac{\partial J}{\partial x_m} \frac{\partial x_m}{\partial W_{lm}} $$In particular:
$$ \frac{\partial x_m}{\partial W_{lm}} = \frac{ \partial }{\partial W_{lm}} \sum_{k=1}^{m-1} W_{km}g_k(x_k) = g_l(x_l) $$We now derive the remaining term $\frac{\partial J}{\partial x_m}$.
For output units $x_m$, we have the following:
$$ \frac{\partial J}{\partial x_m} = \frac{\partial}{\partial x_m} \frac{1}{2} \sum^{M}_{j = I+H+1}( g_j(x_j) - y_j )^2 = \frac{\partial}{\partial x_m} \frac{1}{2} ( g_m(x_m) - y_m )^2 = ( g_m(x_m) - y_m ) g_m'(x_m) = g_m'(x_m) ( g_m(x_m) - y_m ) $$For hidden units $x_m$, we have the following:
$$ \frac{\partial J}{\partial x_m} = \sum_{k = m+1}^{M} \frac{\partial J}{\partial x_k} \frac{\partial x_k}{\partial x_m} $$In particular:
$$ \frac{\partial x_k}{\partial x_m} = \frac{\partial}{\partial x_m} \sum_{a = 1}^{k-1} W_{ak}g_a(x_a) = W_{mk} g_m'(x_m) $$Hence for hidden units $x_m$, we have the following:
$$ \frac{\partial J}{\partial x_m} = \sum_{k = m+1}^{M} \frac{\partial J}{\partial x_k} W_{mk} g_m'(x_m) = g_m'(x_m) \sum_{k = m+1}^{M} \frac{\partial J}{\partial x_k} W_{mk} $$The backpropagation algorithm is then as follows:
For output units $m=M$ down to $m=I+H+1$:
$$ \frac{\partial J}{\partial x_m} = g_m'(x_m) ( g_m(x_m) - y_m ) $$For hidden units $m=I+H$ down to $m=I+1$:
$$ \frac{\partial J}{\partial x_m} = g_m'(x_m) \sum_{k = m+1}^{M} \frac{\partial J}{\partial x_k} W_{mk} $$For non-input units $m=I+1$ up to $M$ and for non-output units $l=1$ up to $I+H$:
$$ \frac{\partial J}{\partial W_{lm}} = g_l(x_l) \frac{\partial J}{\partial x_m} $$
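As a worked instance for the small network above (bias 0, input 1, hidden units 2 and 3, linear output 4): the output unit gives $\frac{\partial J}{\partial x_4} = g_4(x_4) - y_4$, since $g_4'(x_4) = 1$ for a linear unit, and hidden unit 2, which feeds only the output, gives
$$ \frac{\partial J}{\partial x_2} = g_2(x_2)\,\bigl(1 - g_2(x_2)\bigr)\, W_{24}\, \frac{\partial J}{\partial x_4} $$
using the logistic identity $g'(x) = g(x)\,(1 - g(x))$; this $g(1-g)$ factor is exactly the `gprime` computed in the code below.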
In [5]:
def backpropagate(self, X, Y, g):
    T = X.shape[0]
    # initialize the error derivatives to be zero
    dJ_dX = np.zeros((T, self.M))
    # for the linear output units, g'(x) = 1, so dJ/dx is simply g - y
    dJ_dX[:, self.I+self.H:self.M] = g[:, self.I+self.H:self.M] - Y
    # work backwards through the hidden units
    for m in reversed(self.hidden_units()):
        # derivative of the logistic function: g'(x) = g(x) * (1 - g(x))
        gprime = np.multiply(g[:, m], 1 - g[:, m])
        # dJ/dx_m = g'(x_m) * sum_k dJ/dx_k * W_mk, over the units m feeds into
        dJ_dX[:, m] = np.multiply(np.dot(dJ_dX[:, m+1:self.M], self.W[m, m+1:self.M]), gprime)
    # dJ/dW_lm = g_l(x_l) * dJ/dx_m, averaged over the T training cases
    # and masked so only existing connections get nonzero derivatives
    dJ_dW = np.dot(np.transpose(g), dJ_dX)
    dJ_dW = np.multiply(dJ_dW, self.A) / T
    return dJ_dW
The training algorithm is then as follows:
Repeat until convergence (here, for a fixed number of iterations), processing all training cases $t$ together:
- Forward propagate to obtain the pre-nonlinearity activities $X$ and the post-nonlinearity activities $g$
- Backpropagate to obtain all error derivatives $\frac{\partial J}{\partial W_{lm}}$, averaged over the training cases
- Update the weights
Where we update the weights as follows:
$$ W_{lm} \leftarrow W_{lm} - \eta \frac{\partial J}{\partial W_{lm}} $$
In [6]:
def compute_cost(self, target, guess):
    # mean squared error over all training cases (used to monitor training)
    temp = target - guess
    return np.mean(np.multiply(temp, temp))
In [7]:
def train(self, example_X, Y, learning_rate, num_iterations):
    X, g = self._create_X_g(example_X)
    T = X.shape[0]
    print('training using', T, 'examples and', num_iterations, 'iterations with learning rate of', learning_rate)
    costs = np.zeros(num_iterations)
    iteration = 0
    while iteration < num_iterations:
        X, g = self.forward_propagate(X, g)
        # track the cost of the current predictions (assumes a single output unit in the last column of g)
        costs[iteration] = self.compute_cost(Y, np.reshape(g[:, -1], (np.size(g[:, -1]), 1)))
        dJ_dW = self.backpropagate(X, Y, g)
        # gradient descent step
        self.W = self.W - np.multiply(learning_rate, dJ_dW)
        iteration += 1
    return g, costs
In [8]:
class NeuralNetwork:
    def sigmoid(self, z):
        # logistic function, applied elementwise
        return 1.0 / (1.0 + np.exp(-z))

    def __init__(self, A, I, H, O):
        # A is the M x M connectivity matrix; W holds small random initial
        # weights, masked so that only existing connections are nonzero
        self.A = A
        self.W = np.multiply(np.random.normal(-0.001, 0.001, A.shape), A)
        self.I = I
        self.H = H
        self.O = O
        self.M = I + H + O
        assert self.A.shape[0] == self.A.shape[1]
        assert self.A.shape[0] == I + H + O
        print('creating network with', I, 'input units,', H, 'hidden units,', 'and', O, 'output units')

    def hidden_units(self):
        # indices of the hidden units
        return range(self.I, self.I + self.H)

    def output_units(self):
        # indices of the output units
        return range(self.I + self.H, self.M)

    def _create_X_g(self, example_X):
        T = example_X.shape[0]
        X = np.zeros((T, self.M))
        # the inbound (pre-nonlinearity) activity of the bias is not needed
        X[:, 0] = 1
        # the inbound activity of an input unit is the input itself
        X[:, 1:self.I] = example_X
        g = np.zeros(X.shape)
        # the outbound (post-nonlinearity) activity of the bias is always 1
        g[:, 0] = 1
        # the outbound activity of an input unit is the input itself
        g[:, 1:self.I] = example_X
        return X, g

    def forward_propagate(self, X, g):
        for j in self.hidden_units():
            # the pre-nonlinearity activity is the weighted sum of the inbound post-nonlinearity activities
            X[:, j] = np.dot(g, self.W[:, j])
            # the post-nonlinearity activity is the sigmoidal output of the pre-nonlinearity activity
            g[:, j] = self.sigmoid(X[:, j])
        for j in self.output_units():
            X[:, j] = np.dot(g, self.W[:, j])  # same weighted sum as above
            g[:, j] = X[:, j]                  # linear output units
        return X, g

    def backpropagate(self, X, Y, g):
        T = X.shape[0]
        # initialize the error derivatives to be zero
        dJ_dX = np.zeros((T, self.M))
        # for the linear output units, g'(x) = 1, so dJ/dx is simply g - y
        dJ_dX[:, self.I+self.H:self.M] = g[:, self.I+self.H:self.M] - Y
        # work backwards through the hidden units
        for m in reversed(self.hidden_units()):
            # derivative of the logistic function: g'(x) = g(x) * (1 - g(x))
            gprime = np.multiply(g[:, m], 1 - g[:, m])
            # dJ/dx_m = g'(x_m) * sum_k dJ/dx_k * W_mk, over the units m feeds into
            dJ_dX[:, m] = np.multiply(np.dot(dJ_dX[:, m+1:self.M], self.W[m, m+1:self.M]), gprime)
        # dJ/dW_lm = g_l(x_l) * dJ/dx_m, averaged over the T training cases
        # and masked so only existing connections get nonzero derivatives
        dJ_dW = np.dot(np.transpose(g), dJ_dX)
        dJ_dW = np.multiply(dJ_dW, self.A) / T
        return dJ_dW

    def compute_cost(self, target, guess):
        # mean squared error over all training cases (used to monitor training)
        temp = target - guess
        return np.mean(np.multiply(temp, temp))

    def train(self, example_X, Y, learning_rate, num_iterations):
        X, g = self._create_X_g(example_X)
        T = X.shape[0]
        print('training using', T, 'examples and', num_iterations, 'iterations with learning rate of', learning_rate)
        costs = np.zeros(num_iterations)
        iteration = 0
        while iteration < num_iterations:
            X, g = self.forward_propagate(X, g)
            # track the cost of the current predictions (assumes a single output unit in the last column of g)
            costs[iteration] = self.compute_cost(Y, np.reshape(g[:, -1], (np.size(g[:, -1]), 1)))
            dJ_dW = self.backpropagate(X, Y, g)
            # gradient descent step
            self.W = self.W - np.multiply(learning_rate, dJ_dW)
            iteration += 1
        return g, costs
In [9]:
X = np.arange(0, 2*np.pi, 0.1)
Y = np.sin(X)
X = np.reshape(X, (np.size(X), 1))
Y = np.reshape(Y, (np.size(Y), 1))
plt.scatter(X, Y)
plt.show()
In [12]:
A = np.array([
    [0., 0., 1., 1., 1.],
    [0., 0., 1., 1., 0.],
    [0., 0., 0., 0., 1.],
    [0., 0., 0., 0., 1.],
    [0., 0., 0., 0., 0.]])
nn = NeuralNetwork(A, 2, 2, 1)
g, costs = nn.train(X, Y, learning_rate=0.02, num_iterations=50000)
plt.plot(costs)
plt.title('cost vs. iteration')
plt.show()
# network predictions (black) against the true sine targets (green)
plt.scatter(X, g[:, -1], color='k')
plt.scatter(X, Y, color='g')
plt.show()
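As an optional sanity check (not part of the original walkthrough), we can compare the analytic derivative returned by backpropagate against a centered finite-difference estimate for a single connected weight. Note that backpropagate differentiates the cost $J$ above, averaged over the $T$ training cases and including the factor of $\frac{1}{2}$, whereas compute_cost omits that factor; the helper cost_J below, the choice of eps, and the choice of the weight $W_{0,2}$ are all introduced just for this check.
In [ ]:
# gradient check: a sketch, assuming nn, X, Y as defined above
def cost_J(nn, X, Y):
    # J averaged over training cases: (1/T) * sum_t (1/2) * (prediction - target)^2
    _, g = nn.forward_propagate(*nn._create_X_g(X))
    pred = np.reshape(g[:, -1], (np.size(g[:, -1]), 1))
    return 0.5 * np.mean(np.multiply(pred - Y, pred - Y))

l, m = 0, 2   # bias -> first hidden unit (an existing connection in A)
eps = 1e-5    # arbitrary small step
Xf, gf = nn.forward_propagate(*nn._create_X_g(X))
analytic = nn.backpropagate(Xf, Y, gf)[l, m]
W_saved = nn.W[l, m]
nn.W[l, m] = W_saved + eps
cost_plus = cost_J(nn, X, Y)
nn.W[l, m] = W_saved - eps
cost_minus = cost_J(nn, X, Y)
nn.W[l, m] = W_saved  # restore the original weight
numeric = (cost_plus - cost_minus) / (2 * eps)
print('analytic:', analytic, 'numeric:', numeric)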
In [ ]: